In [54]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [55]:
# Importing the data. Spray data for the test-set years (2008, 2010, 2012, 2014) is not provided, so spray information is not used in the analysis.
train = pd.read_csv('./assets/train.csv')
test =pd.read_csv('./assets/test.csv')
weather = pd.read_csv('./assets/weather.csv')
spray =pd.read_csv('./assets/spray.csv')
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
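As a quick check of the note above (a sketch; it assumes the Date columns parse cleanly with pd.to_datetime):
In [ ]:
# years covered by the spray records vs. years in the test set
print sorted(pd.to_datetime(spray['Date']).dt.year.unique())
print sorted(pd.to_datetime(test['Date']).dt.year.unique())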
In [56]:
spray.head()
Out[56]:
In [57]:
train[['Date','Trap','Species','Latitude','Longitude','NumMosquitos','WnvPresent']].head()
Out[57]:
In [58]:
weather.head()
Out[58]:
In [59]:
train.Trap.unique()
Out[59]:
In [61]:
# Number of mosquitoes in traps with or without WNV
ax = sns.boxplot( y="NumMosquitos",x="WnvPresent", data=train,showfliers=False)
plt.show()
In [10]:
# Given the number of mosquitoes in a trap, what is the probability of finding the virus in that trap?
number_of_mosquito_probability = train.groupby(pd.cut(train['NumMosquitos'], range(0, 50, 5)), as_index=False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos'] = [5, 10, 15, 20, 25, 30, 35, 40, 45]  # upper edge of each 5-mosquito bin
In [103]:
ax = sns.barplot(x="NumMosquitos", y="WnvPresent", data=number_of_mosquito_probability)
ax.set(xlabel="Number of Mosquitoes", ylabel="Probability of West Nile Virus")
plt.show()
In [63]:
# Effect of mosquito species on the presence of WNV
mosquito_type = train.groupby(train['Species'], as_index=False).WnvPresent.sum()
ax = sns.barplot(x="WnvPresent", y="Species", data=mosquito_type)
ax.set(xlabel='Number of WNV-positive records')
plt.show()
In [64]:
spec1 = train[train['Species'] =='CULEX RESTUANS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()
In [65]:
train.Species.unique()
Out[65]:
In [66]:
spec1 = train[train['Species'] =='CULEX PIPIENS/RESTUANS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()
In [67]:
spec1 = train[train['Species'] =='CULEX PIPIENS']
number_of_mosquito_probability = spec1.groupby(pd.cut(spec1['NumMosquitos'], range(0, 50,5)), as_index = False).WnvPresent.mean()
number_of_mosquito_probability['NumMosquitos']=[5,10,15,20,25,30,35,40,45]
sns.barplot( x="NumMosquitos",y="WnvPresent", data=number_of_mosquito_probability)
plt.show()
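The three cells above repeat the same binned-probability plot for individual species; a compact loop version (a sketch producing one figure per species) could replace them:
In [ ]:
# sketch: per-species probability of WNV vs. binned mosquito counts
for species in ['CULEX RESTUANS', 'CULEX PIPIENS/RESTUANS', 'CULEX PIPIENS']:
    spec = train[train['Species'] == species]
    prob = spec.groupby(pd.cut(spec['NumMosquitos'], range(0, 50, 5)), as_index=False).WnvPresent.mean()
    prob['NumMosquitos'] = [5, 10, 15, 20, 25, 30, 35, 40, 45]  # upper edge of each bin
    sns.barplot(x="NumMosquitos", y="WnvPresent", data=prob)
    plt.title(species)
    plt.show()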
In [68]:
#Adapted from a Kaggle kernel showing how to plot the measurement points on the map (https://www.kaggle.com/dchudz/predict-west-nile-virus/where-are-the-measurement-points)
#%matplotlib inline
from sklearn.neighbors import KernelDensity
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = train[['Date', 'Trap','Longitude', 'Latitude', 'WnvPresent']]
alpha_cm = plt.cm.Reds
alpha_cm._init()
alpha_cm._lut[:-3,-1] = abs(np.logspace(0, 1, alpha_cm.N) / 10 - 1)[::-1]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)
sigthings = traps[traps['WnvPresent'] > 0]
sigthings = sigthings.groupby(['Date', 'Trap','Longitude', 'Latitude']).max()['WnvPresent'].reset_index()
X = sigthings[['Longitude', 'Latitude']].values
kd = KernelDensity(bandwidth=0.02)
kd.fit(X)
xv,yv = np.meshgrid(np.linspace(-88, -87.5, 100), np.linspace(41.6, 42.1, 100))
gridpoints = np.array([xv.ravel(),yv.ravel()]).T
zv = np.exp(kd.score_samples(gridpoints).reshape(100,100))
plt.figure(figsize=(10,14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)
plt.imshow(zv,
           origin='lower',
           cmap=alpha_cm,
           extent=lon_lat_box,
           aspect=aspect)
locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:,0], locations[:,1], marker='x')
plt.show()
#plt.savefig('heatmap.png')
In [69]:
#Adapted from the same Kaggle kernel (https://www.kaggle.com/dchudz/predict-west-nile-virus/where-are-the-measurement-points): spray locations (x) and trap locations (red o) on the map
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = spray[['Date', 'Longitude', 'Latitude']]   # note: this holds the spray coordinates
traps1 = train[['Longitude', 'Latitude']]          # trap coordinates
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)
plt.figure(figsize=(10, 14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)
locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
loc = traps1[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:, 0], locations[:, 1], marker='x')
plt.scatter(loc[:, 0], loc[:, 1], marker='o', color='red')
plt.show()
In [70]:
# Map of sprayed trap locations. Note: `workingdata` (spray coordinates) and `data` (traps matched
# to spray points) are built in cells In [72]-In [78] below, so this cell must be run after those.
mapdata = np.loadtxt("./assets/mapdata_copyright_openstreetmap_contributors.txt")
traps = workingdata[['Longitude', 'Latitude']]
traps1 = data[['Longitude', 'Latitude']]
aspect = mapdata.shape[0] * 1.0 / mapdata.shape[1]
lon_lat_box = (-88, -87.5, 41.6, 42.1)
plt.figure(figsize=(10, 14))
plt.imshow(mapdata,
           cmap=plt.get_cmap('gray'),
           extent=lon_lat_box,
           aspect=aspect)
locations = traps[['Longitude', 'Latitude']].drop_duplicates().values
loc = traps1[['Longitude', 'Latitude']].drop_duplicates().values
plt.scatter(locations[:, 0], locations[:, 1], marker='x')
plt.scatter(loc[:, 0], loc[:, 1], marker='o', color='red')
plt.show()
In [71]:
trap = train[['Longitude', 'Latitude']].drop_duplicates()
In [72]:
# Round the spray coordinates to 2 decimal places so they can be matched against trap coordinates
workingdata = spray[['Longitude', 'Latitude']]
longi = list(np.round(workingdata.Longitude, 2))
lati = list(np.round(workingdata.Latitude, 2))
In [73]:
# Round the unique trap coordinates to 2 decimal places, as lists, to match the spray coordinates above
trap = train[['Longitude', 'Latitude']].drop_duplicates()
trap_longi = list(np.round(trap.Longitude, 2))
trap_lati = list(np.round(trap.Latitude, 2))
In [74]:
# For each trap, count how many spray points share its rounded coordinates
f = []
for i in range(len(trap_longi)):
    addi = 0.0
    for j in range(len(longi)):
        if (trap_longi[i] == longi[j]) and (trap_lati[i] == lati[j]):
            addi = addi + 1
    f.append(addi)
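The nested loop above is O(traps x spray points); an equivalent vectorized sketch using a pandas merge on the same 2-decimal rounding (the names here are illustrative and not used later in the notebook):
In [ ]:
# sketch: count spray points per rounded trap coordinate with a groupby/merge
spray_counts = (spray[['Longitude', 'Latitude']].round(2)
                .groupby(['Longitude', 'Latitude']).size()
                .reset_index(name='sprayed'))
trap_rounded = train[['Longitude', 'Latitude']].drop_duplicates().round(2)
sprayed_per_trap = trap_rounded.merge(spray_counts, on=['Longitude', 'Latitude'], how='left').fillna(0)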
In [75]:
len(longi)
Out[75]:
In [76]:
print len(f)
sum(f)
Out[76]:
In [77]:
trap['sprayed'] = f
In [78]:
# Traps whose rounded coordinates matched more than one spray point
data = trap[trap['sprayed'] > 1.0]
In [79]:
#EDA
print(train.shape)
print(train.head())
print(train.describe(include ='all'))
print(train.isnull().sum())
print(train.dtypes)
In [80]:
#EDA
#M = missing data
#T = trace precipitation
print(weather.shape)
print(weather.head())
print(weather.describe(include ='all'))
print(weather.isnull().sum())
print(weather.dtypes)
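To see how much of the weather table is affected by these codes, a rough check (the exact whitespace around the 'M' and 'T' flags in weather.csv is an assumption, so values are stripped first):
In [ ]:
# count coded entries per column: 'M' = missing, 'T' = trace precipitation
coded = weather.astype(str).apply(lambda col: col.str.strip())
print (coded == 'M').sum()
print (coded == 'T').sum()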
In [81]:
weather.columns
Out[81]:
In [82]:
#only working with the data from Station No. 1 and using Tavg, total precipitation, and wind speed as variables
weather = weather[weather['Station']== 1]
weather = weather[['Date','Tavg','PrecipTotal','ResultSpeed']]
In [83]:
# Adding the weather data to the train and test data
train_add =train.join(weather.set_index('Date'), on='Date')
test_add = test.join(weather.set_index('Date'), on='Date')
In [84]:
# reformatting Tavg to float
train_add['Tavg']=train_add.Tavg.astype(float)
# assigning 0 instead of T and reformatting PrecipTotal to float
train_add['PrecipTotal']= np.where(train_add['PrecipTotal']==' T', '0.0', train_add['PrecipTotal'])
train_add['PrecipTotal']= train_add.PrecipTotal.astype(float)
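The same reformatting is presumably needed for test_add before it can be fed to a model; a sketch under that assumption:
In [ ]:
# assumed mirror of the cleaning above for the test set
test_add['Tavg'] = test_add.Tavg.astype(float)
test_add['PrecipTotal'] = np.where(test_add['PrecipTotal'] == ' T', '0.0', test_add['PrecipTotal'])
test_add['PrecipTotal'] = test_add.PrecipTotal.astype(float)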
In [100]:
# Tavg in traps with or without WNV
ax = sns.boxplot(y="Tavg", x="WnvPresent", data=train_add, showfliers=False)
ax.set(xlabel="West Nile Virus Present", ylabel="Average Temperature")
plt.show()
In [98]:
ax = sns.boxplot(y="PrecipTotal", x="WnvPresent", data=train_add, showfliers=False)
ax.set(xlabel="West Nile Virus Present", ylabel="Precipitation Total")
plt.show()
In [150]:
ax = sns.boxplot(y="ResultSpeed", x="WnvPresent", data=train_add)
plt.show()
In [248]:
from datetime import datetime
train['Date'] = pd.to_datetime(train['Date'], format='%Y-%m-%d')
test['Date'] = pd.to_datetime(test['Date'], format='%Y-%m-%d')
weather['Date'] = pd.to_datetime(weather['Date'], format='%Y-%m-%d')
spray['Date'] = pd.to_datetime(spray['Date'], format='%Y-%m-%d')
In [249]:
train['week'] = train['Date'].dt.weekofyear
train['year'] = train['Date'].dt.year
test['week'] = test['Date'].dt.weekofyear
test['year'] = test['Date'].dt.year
spray['year']=spray['Date'].dt.year
weather['year']=weather['Date'].dt.year
In [78]:
print test.columns
train.columns
Out[78]:
In [6]:
weeks = train.groupby(['year','week'], as_index=False).WnvPresent.sum()
In [80]:
test.head()
Out[80]:
In [71]:
weeks.dtypes
Out[71]:
In [79]:
ax = sns.lmplot(data=weeks, x='week', y='WnvPresent', hue='year', fit_reg=False, palette="Set1", markers='_')
plt.show()
In [45]:
weeks['index'] =weeks.index
weeks.head()
Out[45]:
In [307]:
weeks.year.unique()
Out[307]:
In [309]:
# Weekly count of WNV-positive records per season, darker lines for more recent years
fig = plt.figure(figsize=(7, 4))
ax = fig.gca()
ax.plot(weeks[weeks['year'] == 2007].week, weeks[weeks['year'] == 2007].WnvPresent, lw=2, color='darkred', alpha=0.1, label='2007')
ax.plot(weeks[weeks['year'] == 2009].week, weeks[weeks['year'] == 2009].WnvPresent, lw=2, color='darkred', alpha=0.3, label='2009')
ax.plot(weeks[weeks['year'] == 2011].week, weeks[weeks['year'] == 2011].WnvPresent, lw=2, color='darkred', alpha=0.5, label='2011')
ax.plot(weeks[weeks['year'] == 2013].week, weeks[weeks['year'] == 2013].WnvPresent, lw=2, color='darkred', alpha=1.0, label='2013')
ax.legend(loc='upper left')
plt.show()
In [312]:
w30 =pd.read_csv('weather_ave_30.csv')
In [313]:
w30.head()
Out[313]:
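weather_ave_30.csv is loaded from disk but never generated in this notebook; a minimal sketch of how such a file could be produced, assuming it holds 30-day rolling means of the Station 1 variables used above (the window, column choice, and file layout are assumptions):
In [ ]:
# hypothetical reconstruction of weather_ave_30.csv: rolling means over the previous 30 rows (~30 days of daily records)
w30_sketch = weather.sort_values('Date').set_index('Date')[['Tavg', 'PrecipTotal', 'ResultSpeed']]
w30_sketch = w30_sketch.apply(pd.to_numeric, errors='coerce')  # 'M' and 'T' codes become NaN here
w30_sketch = w30_sketch.rolling(window=30, min_periods=1).mean().reset_index()
# w30_sketch.to_csv('weather_ave_30.csv', index=False)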
In [ ]:
# Note: GridSearchCV, roc_curve, and auc are used below, so they are imported here as well.
# sklearn.cross_validation / sklearn.grid_search are the pre-0.18 module names; in newer
# scikit-learn these live in sklearn.model_selection.
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.cross_validation import cross_val_score, StratifiedKFold, train_test_split, KFold
from sklearn.grid_search import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
In [ ]:
# Model evaluation function; relies on the global X, y, X_test, y_test defined below
def evaluate_model(model):
    model.fit(X, y)
    y_pred = model.predict(X_test)
    a = accuracy_score(y_test, y_pred)
    probabilities = model.predict_proba(X_test)
    conmat = np.array(confusion_matrix(y_test, y_pred, labels=[1, 0]))
    confusion = pd.DataFrame(conmat, index=['1', '0'],
                             columns=['predicted_1', 'predicted_0'])
    cr = classification_report(y_test, y_pred)
    print 'confusion matrix:'
    print confusion
    print 'classification_report:'
    print cr
    print 'Accuracy of the model on test:', a
    return probabilities
In [ ]:
X = train[['week', 'year']]
y = train['WnvPresent']
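evaluate_model above also needs X_test and y_test, which are not defined anywhere else in the notebook; a sketch of the missing hold-out split (the test fraction and random state are assumptions):
In [ ]:
# assumed hold-out split so that evaluate_model's X_test / y_test exist
X, X_test, y, y_test = train_test_split(X, y, test_size=0.3, random_state=33)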
In [ ]:
## Grid search over Random Forest parameters
# max_depth must be a positive integer, so integer values are used here (assumed correction of the original float grid)
params = {'max_features': [0.1, 0.5, 1.0], 'max_depth': [3, 5, 10], 'n_estimators': [3, 5, 10, 50]}
gsrf = GridSearchCV(RandomForestClassifier(random_state=33),
                    params, n_jobs=-1,
                    cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsrf.fit(X, y)
print 'best parameters for the model:', gsrf.best_params_
print 'best score on train:', gsrf.best_score_
probability = evaluate_model(gsrf.best_estimator_)
In [ ]:
# Grid search over KNN parameters
params = {'n_neighbors': range(2, 60)}
gsknn = GridSearchCV(KNeighborsClassifier(),
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsknn.fit(X, y)
print 'best parameters for the model:', gsknn.best_params_
print 'best score on train:', gsknn.best_score_
probability = evaluate_model(gsknn.best_estimator_)
In [ ]:
# Grid search over logistic regression parameters
# the 'sag' solver only supports the l2 penalty, so 'liblinear' is used here to allow both l1 and l2
params = {'penalty': ['l2', 'l1'], 'C': [0.1, 1.0, 10]}
gslr = GridSearchCV(LogisticRegression(solver='liblinear', random_state=33),
                    params, n_jobs=-1,
                    cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gslr.fit(X, y)
print 'best parameters for the model:', gslr.best_params_
print 'best score on train:', gslr.best_score_
probability = evaluate_model(gslr.best_estimator_)
In [ ]:
# Scale the features for the SVM (StandardScaler has to be fit to the data, not called on it)
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X_test = scaler.transform(X_test)
# Grid search over support vector machine parameters
params = {'C': [0.01, 0.1, 1.0, 10.0, 30.0, 100.0],
          'gamma': ['auto', 0.1, 1.0, 10.0],
          'kernel': ['linear', 'rbf']}
gssvc = GridSearchCV(SVC(probability=True, random_state=33),  # probability=True so evaluate_model can call predict_proba
                     params, n_jobs=-1,
                     cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gssvc.fit(scaled_X, y)
print 'best parameters for the model:', gssvc.best_params_
print 'best score on train:', gssvc.best_score_
probability = evaluate_model(gssvc.best_estimator_)  # caution: evaluate_model refits and predicts on the unscaled X/X_test
In [ ]:
## Grid search over Gradient Boosting parameters
# max_depth must be a positive integer, so integer values are used here (assumed correction of the original float grid)
params = {'max_features': [0.1, 0.5, 1.0], 'max_depth': [1, 3, 5], 'n_estimators': [3, 5, 10, 50]}
gsgb = GridSearchCV(GradientBoostingClassifier(random_state=33),
                    params, n_jobs=-1,
                    cv=StratifiedKFold(y, n_folds=3, shuffle=True))
gsgb.fit(X, y)
print 'best parameters for the model:', gsgb.best_params_
print 'best score on train:', gsgb.best_score_
probability = evaluate_model(gsgb.best_estimator_)
In [ ]:
### TODO: also compare the tuned models with cross_val_score
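A minimal sketch of that comparison, using the X and y defined above and the tuned estimators from the grid searches (not part of the original analysis):
In [ ]:
# sketch: 3-fold cross-validated accuracy for each tuned model
for name, est in [('Random Forest', gsrf.best_estimator_),
                  ('KNN', gsknn.best_estimator_),
                  ('Logistic Regression', gslr.best_estimator_),
                  ('Gradient Boosting', gsgb.best_estimator_)]:
    scores = cross_val_score(est, X, y, cv=StratifiedKFold(y, n_folds=3, shuffle=True))
    print name, 'mean CV accuracy:', scores.mean()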
In [ ]:
#AUC
y_score = []
for i in range(0, len(probability)):
    y_score.append(probability[i][1])
FPR = dict()
TPR = dict()
ROC_AUC = dict()
# For class 1, find the area under the curve
FPR[1], TPR[1], _ = roc_curve(y_test, y_score)
ROC_AUC[1] = auc(FPR[1], TPR[1])
# Plot of a ROC curve for class 1
plt.figure(figsize=[5,5])
plt.plot(FPR[1], TPR[1], label='ROC curve (area = %0.2f)' % ROC_AUC[1], linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', linewidth=4)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12)
plt.ylabel('True Positive Rate', fontsize=12)
plt.title('Receiver operating characteristic for West Nile Virus presence', fontsize=12)
plt.legend(loc="lower right")
plt.savefig('ROC_AUC.png')
plt.show()